/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the dataset of the unemployment spells.
	
*******************************************************************************/

* Start from an empty Stata session before building the spell datasets.
clear all
* Prefix carried by the name of every data file and graph this do-file writes.
global id_code "001_1"

********************************************************************************
************************ 1. PES data: Sub-spells ******************************
********************************************************************************

********************************************************************************
* 1.1: PES data, sub-spells - rename the variables and merge the data
********************************************************************************

* Load the raw PES file of category sub-spells.
use "${path_PES}/AF_Sokatper.dta", clear

* Bring the raw upper-case names in line with the names used in vy_Sok.dta
* and in the rest of this do-file.
rename (INLNR SOKATLNR SKAT NYSKAT INSKA_DAT UTSKA_DAT) ///
	(InLnr SokatLnr Skat NySkat InSka_Dat UtSka_Dat)

* Variable labels.
* InLnr identifies an unemployment spell. A spell is usually made up of several
* consecutive sub-spells (SokatLnr), because people move between unemployment
* categories within one spell (e.g. 40% of the spell in category X, the rest
* in category Y).
* Skat is the category of the current sub-spell, NySkat the category of the
* sub-spell that follows it. PES documentation describes the categories; for
* example, Skat 11 means openly unemployed, with UI benefits and no wage.
label var LopNr_PersonNr "personal identifier"
label var InLnr "Identifier of unempl spell"
label var SokatLnr "Identifier of category spell (sub-spell)"
label var Skat "Sub-spell category"
label var NySkat "New category"
label var InSka_Dat "Date Skat category begins"
label var UtSka_Dat "Date Skat category ends"

* Variables that are not used:
*  - INTR_DAT: meaning unclear; it does not always equal the spell start.
*  - ANTDGR: days in unemployment, but stored as a string whose last character
*    is missing, so it disagrees with UtSka_Dat-InSka_Dat.
drop INTR_DAT ANTDGR

* Align storage types with vy_Sok.dta so that the two files can be appended.
destring InLnr SokatLnr, replace
tostring Skat, replace

* Stack the sub-spells from the second source file (other years).
append using "${path_PES}/vy_Sok.dta" , keep(LopNr_PersonNr InLnr SokatLnr Skat NySkat InSka_Dat UtSka_Dat)

* Categories back to numeric now that the files are combined.
destring Skat NySkat, replace

* Convert the string dates (year-month-day) to Stata daily dates.
* searchStart: day on which the person enters category Skat.
* searchEnd:   last day on which the person belongs to category Skat.
gen searchStart = date(InSka_Dat,"YMD")
gen searchEnd = date(UtSka_Dat,"YMD")
format searchStart searchEnd %td
label var searchStart "Date Skat category begins"
label var searchEnd "Date Skat category ends"

* The string versions are no longer needed.
drop InSka_Dat UtSka_Dat

********************************************************************************
* 1.2: PES data, sub-spells - eliminate duplicates
********************************************************************************

* Purpose of this section: leave at most one row per
* (LopNr_PersonNr, InLnr, SokatLnr). Rules are applied from the most to the
* least specific; the observation counts in the comments are those reported
* by the authors when the file was run.

* Remove rows that are identical in every variable (the same sub-spell can be
* present in both of the appended source files):
duplicates drop // 5 638 825 observations deleted

* Drop every unemployment spell that has at least one sub-spell with
* inconsistent dates (start after end):
gen test=(searchStart>searchEnd)
tab test, missing // there are 617 333 observations with negative duration
bys Lop* InLnr: egen test2=max(test) // flag the whole spell if at least one sub-spell has incorrect dates
drop if test2==1 // 5 454 976 observations deleted (approx. 1 %)
drop test test2

* Are there rows that share person, spell ID, sub-spell ID, start date and end date?
duplicates report Lop* InLnr SokatLnr searchStart searchEnd // There are duplicates in terms of these variables (surplus of 1 observation)

* Inspection block: with pause off the pause commands below do nothing, and
* preserve/restore leaves the data unchanged. Turn pause on to browse the
* duplicated rows interactively.
pause off
preserve
bys Lop* InLnr SokatLnr searchStart searchEnd: keep if _N>1
pause: kept duplicates // The remaining duplicate is for person with ID 7227982
restore

* Inspection block for that one person (again a no-op while pause is off).
preserve 
keep if LopNr_PersonNr==7227982
pause: kept only the person of interest
restore 
/* The duplicated observation seems to be a mistake. It has the same dates as 
the previous sub-spell, and there is another observation with the same Skat 
but correct dates. We drop the incorrect observation. */
drop if LopNr_PersonNr==7227982 & SokatLnr==1 & NySkat==. & Skat==14

* Among rows that share person, spell, sub-spell, Skat and start date, drop 
* the rows in which both NySkat and searchEnd are missing
bys Lop* InLnr SokatLnr Skat searchStart: drop if NySkat==. & searchEnd==. & _N>1 // 575 737 observations deleted

duplicates report Lop* InLnr SokatLnr searchStart  // There are still some duplicates in terms of these variables (surplus of 1728)
* Inspection block: keeps all rows of any spell that contains such a duplicate
* (no-op while pause is off).
pause off
preserve
bys Lop* InLnr SokatLnr searchStart : gen test=1 if _N>1
by Lop* InLnr: egen test2=max(test)
keep if test2==1
pause: kept duplicates // 
restore

* Same rule as above but without requiring the same start date: among rows 
* that share person, spell, sub-spell and Skat, drop those with NySkat and 
* searchEnd both missing
bys Lop* InLnr SokatLnr Skat: drop if NySkat==. & searchEnd==. & _N>1 // 2 624 observations deleted

* Among rows that share person, spell, sub-spell and start date, keep the row 
* whose end date equals the start date of the row that follows it.
* nextStart is the start date of the next row within the spell, filled in only 
* when that next row's Skat equals this row's NySkat. It is computed under two 
* orderings (ties on start date broken by ascending, then by descending, end 
* date); the second pass overwrites nextStart wherever its condition holds.
* Rows in a duplicated group whose searchEnd differs from nextStart (including 
* a missing nextStart) are dropped. The final by-prefix relies on the sort 
* order left by the second gsort.
gsort Lop* InLnr SokatLnr searchStart searchEnd
bys Lop* InLnr: gen nextStart=searchStart[_n+1] if NySkat==Skat[_n+1]
gsort Lop* InLnr SokatLnr searchStart -searchEnd
bys Lop* InLnr: replace nextStart=searchStart[_n+1] if NySkat==Skat[_n+1]
by Lop* InLnr SokatLnr searchStart: drop if searchEnd!=nextStart & _N>1 // 1 796 observations deleted
drop nextStart
* We no longer have duplicates in terms of Lop* InLnr SokatLnr searchStart

duplicates report LopNr_PersonNr InLnr SokatLnr // We still have duplicates in terms of these variables, surplus of 12k

* Inspection block (no-op while pause is off).
preserve
bys Lop* InLnr SokatLnr : gen test=1 if _N>1
by Lop* InLnr: egen test2=max(test)
keep if test2==1
pause: kept duplicates // 
restore

* If several rows share person, spell and sub-spell, drop those with NySkat 
* missing
bys Lop* InLnr SokatLnr : drop if NySkat==. & _N>1 // 9 138 observations deleted

* Last resort: drop the WHOLE spell (all of its sub-spells) if it still 
* contains rows duplicated in terms of person, spell and sub-spell
bys Lop* InLnr SokatLnr : gen test=1 if _N>1
by Lop* InLnr: egen test2=max(test)
drop if test2==1 //23 706 observations deleted
drop test test2

* Final check: no duplicates should be reported here
duplicates report LopNr_PersonNr InLnr SokatLnr

********************************************************************************
* 1.3: PES data, sub-spells - identify the initial Skat and keep only those that start in selected categories
********************************************************************************

* Record, on every row of an unemployment spell (InLnr), the category (Skat)
* of the spell's first sub-spell, i.e. the sub-spell with the lowest SokatLnr.
bys Lop* InLnr : egen firstSokat=min(SokatLnr)
gen skatAtFirst=Skat if SokatLnr==firstSokat
* skatAtFirst is non-missing only on the first sub-spell; the mean spreads
* that value over the whole spell.
bys Lop* InLnr: egen initialSkat=mean(skatAtFirst)
drop firstSokat skatAtFirst
label var initialSkat "Skat at the start of unempl spell"
tab initialSkat

* Keep a copy with ALL sub-spells; it is used for summary statistics and
* again in section 2.2.
save "${data}/${id_code}_UnemploymentSubspells_Initial", replace

use "${data}/${id_code}_UnemploymentSubspells_Initial", clear

* Spells count as unemployment only if they START in one of the categories
* Skat = 11, 12, 13, 14, 34, 74, 76, 96, 97 or 98.
local starts_unemployed inrange(initialSkat, 11, 14) | inlist(initialSkat, 34, 74, 76) | inrange(initialSkat, 96, 98)

* At this stage there are 17,649,741 unique spells (ignoring sub-spells);
* 13,197,717 of them start in one of the unemployment categories.
count if `starts_unemployed'

* Keep those spells; 4,452,024 spells are deleted.
keep if `starts_unemployed'

save "${data}/${id_code}_UnemploymentSubspells_Final", replace

********************************************************************************
************************ 2. PES data: Spells ***********************************
********************************************************************************

********************************************************************************
* 2.1: PES data, spells - rename the variables and merge the data
******************************************************************************** 

* Load the raw PES file with one row per unemployment spell.
use "${path_PES}/AF_Insper.dta", clear

* Bring the raw upper-case names in line with the names used in Insper.dta.
rename (INLNR AV_DAT AKT_DAT AVORS) (InLnr Av_Dat Akt_Dat AvOrs)

* Variable labels.
label var LopNr_PersonNr "personal identifier"
label var InLnr "Identifier of unempl spell"
label var Av_Dat "Date leave unempl office"
label var Akt_Dat "Date register at the unempl office"
label var AvOrs "Reason for leaving PES"

* Keep the identifiers, the two dates and the reason for leaving (AvOrs).
* Everything else is discarded, including INTR_DAT and UTTR_DAT (meaning
* unknown), KASNR (cash assistance) and ILOKS.
keep LopNr_PersonNr InLnr Av_Dat Akt_Dat AvOrs

* Same storage type for the spell identifier as in the file appended next.
destring InLnr, replace

* Stack the spells from the second source file.
append using "${path_PES}/Insper.dta", keep(LopNr_PersonNr InLnr Av_Dat Akt_Dat AvOrs)

* Convert the string dates (year-month-day) to Stata daily dates.
* startU: day of registration at the employment office.
gen startU=date(Akt_Dat,"YMD")
* year: calendar year of that registration.
gen year = year(startU)
* outU: day of exit from the unemployment service.
gen outU=date(Av_Dat,"YMD")

label var startU "Date register at unempl office"
label var year "Year when register at unempl office"
label var outU "Date leave unempl office"

format startU outU %td

* The string versions are no longer needed.
drop Akt_Dat Av_Dat

********************************************************************************
* 2.2: PES data, spells - eliminate duplicates
********************************************************************************

* Purpose of this section: leave one row per (LopNr_PersonNr, InLnr) in the
* spell file, using the sub-spell dates saved in section 1.3 to decide which
* of several conflicting rows to keep. The observation counts in the comments
* are those reported by the authors when the file was run.

* Remove rows that are identical in every variable
duplicates drop // 808 419 observations deleted

* Among rows that share person, spell ID and start date, drop those with a 
* missing exit date
bys Lop* InLnr startU year: drop if outU==. & _N>1 // 562 429 observations deleted

* Build, from the sub-spell file, the start date of the first sub-spell and 
* the end date of the last sub-spell of every spell. The bare pause commands 
* do nothing because pause is switched off here.
pause off
preserve
use "${data}/${id_code}_UnemploymentSubspells_Initial", clear
sort Lop* InLnr SokatLnr
* First row of the spell (lowest SokatLnr) carries the start date, the last 
* row carries the end date ...
by Lop* InLnr: gen startDate_init=searchStart if _n==1
by Lop* InLnr: gen endDate_init=searchEnd if _n==_N
pause
* ... and the mean over the single non-missing value copies it to all rows.
by Lop* InLnr: egen startDate=mean(startDate_init)
by Lop* InLnr: egen endDate=mean(endDate_init)
pause
keep Lop* InLnr startDate endDate
format startDate endDate %td

* One row per spell, stored in a temporary file
duplicates drop LopNr_PersonNr InLnr, force
tempfile temp
save `temp'
restore

* Attach the sub-spell based dates; spells without any sub-spell are dropped
merge m:1 Lop* InLnr using `temp'
keep if _merge==3
drop _merge


* Among rows with the same spell ID, drop those whose dates disagree with the 
* sub-spell based dates
bys Lop* InLnr: drop if startU!=startDate & _N>1 // 8 932 observations deleted
bys Lop* InLnr: drop if outU!=endDate & _N>1 // 159 observations deleted

* Same rule among rows that share person and start date (different spell IDs)
bys Lop* startU: drop if startU!=startDate & _N>1 // 1 405 074  observations deleted
bys Lop* startU: drop if outU!=endDate & _N>1 // 121 observations deleted


duplicates report Lop* InLnr // no more duplicates
duplicates report Lop* startU // there is a surplus of about 170 K

* Among rows that share person and start date, drop those with a missing 
* exit date
bys Lop* startU year: drop if outU==. & _N>1 // 443 observations deleted

* Drop if start date later than end date (a missing startU also satisfies 
* this condition when outU is not missing, because missing sorts last)
drop if startU>outU // 177 observations deleted

sort Lop* InLnr

* dup: 0 if the person has a single spell starting on that date, otherwise 
* the running number of the row among the spells sharing that start date
bys Lop* startU: gen dup=cond(_N==1,0,_n)
* NOTE: pause is switched ON from here, so in an interactive session each 
* pause below hands control to the user until execution is resumed.
pause on
preserve
keep if dup>0
pause
restore
* dup2 > 0 marks every spell of a person who has at least one such duplicate; 
* it is kept in the saved file and used for inspection in section 2.3
by Lop*: egen dup2=mean(dup)
pause on
preserve
keep if dup2>0
pause
restore
* It is not obvious which of the duplicate spells is correct, so they are 
* kept at this point

* Save the cleaned spell-level file
save "${data}/${id_code}_UnemploymentSpells_Save", replace

********************************************************************************
* 2.3: Merge the dataset with spells and the one with sub-spells
********************************************************************************

* Attach the sub-spells to the spells in memory (one spell, many sub-spells)
* and keep only the spells that are present in both files.
merge 1:m Lop* InLnr using "${data}/${id_code}_UnemploymentSubspells_Final", keep(match) nogenerate

* Order the sub-spells chronologically within each spell.
sort Lop* InLnr SokatLnr

* The sub-spell based dates were only needed for the de-duplication above.
drop startDate endDate

* Interactive inspection of the people who have spells sharing a start date.
* The data are restored afterwards.
pause on
preserve
keep if dup2>0
pause
restore

* Among rows that share person, spell start, sub-spell start and Skat, drop
* those with NySkat missing; 173 375 observations are deleted.
bys Lop* startU searchStart Skat: drop if NySkat==. & _N>1

save "${data}/${id_code}_UnemploymentSpells_temp.dta", replace

* There are still duplicate spells that are for the same person and start on the same date, we leave them for now

********************************************************************************
* 2.4: PES data, spells - generate end of unemployment spell 
********************************************************************************
use "${data}/${id_code}_UnemploymentSpells_temp.dta", clear

/* End of the unemployment spell, computed per sub-spell. By default it is the 
exit date of the spell (outU). If the sub-spell is followed by a category that 
no longer counts as unemployment (given by NySkat), it is the end date of the 
sub-spell (searchEnd) instead. Section 2.6 takes the earliest of these dates 
within each spell.

Two definitions are built. They differ in whether a move into training ends 
the unemployment spell. */

* Definition 1: a move into training does NOT end the spell. The spell ends 
* only on a move to a category that is neither unemployment nor training.
local exit_not_training inrange(NySkat, 21, 33) | inrange(NySkat, 35, 51) | inlist(NySkat, 53, 56, 58, 77, 78, 84)
gen trueEnd = cond(`exit_not_training', searchEnd, outU)
label var trueEnd "True end date of the unemployment spell (training counted as unemployment)"

* Definition 2: a move into training DOES end the spell. Category 84, listed 
* separately in earlier versions, lies inside the range 77-88.
local exit_or_training inrange(NySkat, 21, 33) | inrange(NySkat, 35, 73) | NySkat == 75 | inrange(NySkat, 77, 88) | NySkat == 99
gen trueEnd2 = cond(`exit_or_training', searchEnd, outU)
label var trueEnd2 "True end date of the unemployment spell (not counting training as unemployment)"

format trueEnd trueEnd2 %td

* Sub-spell level file, also used for descriptive statistics.
save "${data}/${id_code}_UnemploymentSubspells", replace

********************************************************************************
* 2.5: Define training and other ALMPs
********************************************************************************

use "${data}/${id_code}_UnemploymentSubspells", clear

/* This section classifies every sub-spell by type of active labour market 
policy (ALMP) and then builds, for every spell and every type x:
	any_x   = 1 if the spell contains at least one sub-spell of type x
	x_in6M  = 1 if the first sub-spell of type x starts within the first 
	          180 days of the spell
Only these spell-level variables are kept; all helper variables are dropped 
before saving. */

* Calendar year in which the sub-spell starts. searchStart is a daily date 
* (days since 01jan1960), so it must NOT be compared with a year number 
* directly: as a daily date, 2005 is a day in 1965. All time-dependent rules 
* below therefore use this variable. Where a rule uses ">", a missing year is 
* excluded explicitly, because Stata treats missing as larger than any number.
gen subspellYear = year(searchStart)

* Old definition of training
gen training = (Skat==52 | Skat==54 | Skat==55 | Skat==57 |					///
	(Skat>=59 & Skat<=73) | Skat==75 | (Skat>=79 & Skat<=83) 				///
	| (Skat>=85 & Skat<=88) | Skat==99)

* Vocational training (this info is from "Skat Master File" in Dropbox)
* NOTE(review): Skat 76 is listed under both vocational and non-vocational 
* training - confirm against the Skat Master File.
gen voc_training = 0
replace voc_training = 1 if inlist(Skat, 32, 64, 68, 69, 76, 81, 82, 84, 86, 87, 88)

* Non-vocational training
gen nonvoc_training = 0
replace nonvoc_training = 1 if inlist(Skat, 28, 71, 73, 74, 76, 80, 83)

* Skat 72 changes meaning over time: the first definition is used until 2001, 
* the second from 2010. The cut-off year 2005 is arbitrary; any year between 
* 2002 and 2009 works.
replace nonvoc_training = 1 if Skat==72 & subspellYear>2005 & subspellYear<.

* Either type of training
gen training_combined = 0
replace training_combined = 1 if voc_training==1 | nonvoc_training==1

* Work experience
* NOTE(review): Skat 72 is counted as work experience in every year, and also 
* as non-vocational training after 2005 - confirm this overlap is intended.
gen work_experience = 0
replace work_experience = 1 if inlist(Skat, 52, 54, 55, 59, 60, 61, 62, 63, 67)
replace work_experience = 1 if inlist(Skat, 15, 57, 70, 72, 85)

* "Workfare employment"
gen workfare = 0
replace workfare = 1 if inlist(Skat, 65, 66, 79, 99)

* Categories 50, 51 and 56 change meaning over time.
* NOTE(review): the rules below use strict inequalities, so sub-spells that 
* start in the cut-off year itself (2005, 2015) fall into neither branch - 
* confirm this is intended.
replace workfare = 1 if Skat==50 & subspellYear>2015 & subspellYear<.
replace workfare = 1 if Skat==51 & (subspellYear<2005 | (subspellYear>2010 & subspellYear<.))
replace workfare = 1 if Skat==56 & subspellYear<2005

* Subsidized work (not for the disabled)
* NOTE(review): Skat 51 and 58 are in the unconditional list below although 
* they are also assigned by year (51 to workfare, 58 to subsidized work for 
* the disabled) - confirm whether they belong in this list.
gen subs_work_notdisabled = 0
replace subs_work_notdisabled = 1 if inlist(Skat, 30, 33, 36, 37, 40, 44, 45, ///
	47, 48, 49, 51, 53, 58, 77, 78)

* Categories that change meaning over time
replace subs_work_notdisabled = 1 if Skat==50 & subspellYear<2015
replace subs_work_notdisabled = 1 if Skat==58 & subspellYear<2005

* Subsidized work for the disabled
gen subs_work_disabled = 0
replace subs_work_disabled = 1 if inlist(Skat, 38, 39, 42, 43)

* Categories that change meaning over time
replace subs_work_disabled = 1 if Skat==56 & subspellYear>2005 & subspellYear<.
replace subs_work_disabled = 1 if Skat==58 & subspellYear>2005 & subspellYear<.

* Start-up incentive
gen startup_incentive = 0
replace startup_incentive = 1 if Skat==46

* Any active labor market policy (ALMP)
gen ALMP = 0
replace ALMP = 1 if (training_combined==1 | work_experience==1 | workfare==1 |	///
		subs_work_notdisabled==1 | subs_work_disabled==1 | startup_incentive==1)

* All sub-spell types for which spell-level variables are built
local categories training voc_training nonvoc_training training_combined 	///
	work_experience workfare subs_work_notdisabled subs_work_disabled		///
	startup_incentive ALMP

sort Lop* InLnr searchStart
foreach x of local categories {
	* Start date of the first sub-spell of type x within the spell (missing if 
	* the spell has no such sub-spell)
	bys Lop* InLnr: egen `x'_date = min(cond(`x'==1, searchStart, .))
	format `x'_date %td

	* The spell includes at least one sub-spell of type x
	bys Lop* InLnr: egen any_`x' = max(`x')

	* Day of the SPELL on which the first SUB-SPELL of type x starts (the 
	* first day of the spell is day 1)
	gen `x'_start = `x'_date + 1 - startU
}

* Dummy for whether the spell entered type x within its first 180 days; 0 
* also when the spell never entered type x
foreach x of local categories {
	gen `x'_in6M = inrange(`x'_start, 0, 180)
}

* Drop the sub-spell level indicators and all helper variables; only any_x 
* and x_in6M are kept
foreach x of local categories {
	drop `x' `x'_date `x'_start
}
drop subspellYear

save "${data}/${id_code}_UnemploymentSubspells_2", replace	
	
********************************************************************************
* 2.6: PES data - collapse the merged data to have one observation per spell
********************************************************************************

use "${data}/${id_code}_UnemploymentSubspells_2", clear

/* Go from one row per sub-spell to one row per unemployment spell:
	- startU, year and the training/ALMP indicators: mean over the sub-spells 
	  (a spell has a single start date, so the mean of startU and year is 
	  simply that value);
	- trueEnd and trueEnd2: earliest value over the sub-spells. */
sort Lop* InLnr searchStart
collapse (mean) startU year any_training training_in6M 						///
	any_training_combined training_combined_in6M any_ALMP ALMP_in6M 		///
	(min) trueEnd trueEnd2, by(LopNr_PersonNr InLnr)

* Duration of the unemployment spell under the two definitions of its end
gen duration = trueEnd - startU
gen duration2 = trueEnd2 - startU
label var duration "Duration of unemployment spell in days (counting training as unempl)"
label var duration2 "Duration of unemployment spell in days (not counting training as unempl)"

* Spells with a negative duration are removed (23 356 observations). A missing 
* duration is never negative, so spells without an end date are kept.
count if duration<0
drop if duration<0

* About 9 806 spells still have a negative duration2; they are kept because 
* duration2 is only an exploratory variable.
count if duration2<0

* A person should not have two spells starting on the same day. There is a 
* surplus of about 8K such observations and the reason is unclear, so ALL 
* spells involved in such a duplicate are dropped.
duplicates report Lop* startU
bys Lop* startU: drop if _N>1

compress
count // 12 064 745
save "${data}/${id_code}_UnemploymentSpells.dta", replace

********************************************************************************
* 2.7: PES data - calculating the number of unemployment spells in previous years
********************************************************************************

* Purpose of this section: for every spell, add the number of unemployment 
* spells of the same person in the 1 to 5 calendar years before the year in 
* which the spell starts (unemplSpells1Ybefore ... unemplSpells5Ybefore), and 
* indicators any_LT_spells_1Y ... any_LT_spells_5Y built from the long-spell 
* dummy. The result is merged back into the spell-level file.

use "${data}/${id_code}_UnemploymentSpells.dta", clear

* Add one extra row per person that contains only the ID and test==1. These 
* rows survive every keep/drop in the loop below, so each person is present in 
* each yearly file even without any spell in the window.
preserve
keep LopNr_PersonNr
duplicates drop
gen test=1
tempfile tempIDs
save `tempIDs'
restore


append using `tempIDs'

* Generate variable that takes value of 1 if a spell is ongoing in a particular year
forvalues year0 = 1985/2017 {
	
	* Spell starts in year0, or starts earlier and ends in year0 or later. 
	* A missing trueEnd also satisfies ">=", because Stata treats missing as 
	* larger than any number, so spells without an end date count as ongoing.
	gen unemplYear`year0'=(year(startU)==`year0')
	replace unemplYear`year0'=1 if (year(startU)<`year0' & year(trueEnd)>=`year0')
	
	* Dummy for a spell of more than 6 months (180 days).
	* NOTE(review): the definition does not depend on year0, so the 33 variables 
	* LT_spell1985 ... LT_spell2017 are identical - confirm this is intended.
	gen LT_spell`year0' = (duration>6*30 & duration!=.)
	
}

pause off
forvalues year0 = 1990/2017 {	
	
	preserve
	
	* The five calendar years before year0
	local year1 =`year0'-1
	local year2 =`year0'-2
	local year3 =`year0'-3
	local year4 =`year0'-4
	local year5 =`year0'-5

	* Keep the spells ongoing in at least one of those years, plus the ID-only rows
	keep if (unemplYear`year1'==1 | unemplYear`year2'==1 | unemplYear`year3'==1 | unemplYear`year4'==1 | unemplYear`year5'==1 | test==1)	
	
	* Go from the 5-year window down to the 1-year window
	forval Y = 5(-1)1 {
		
		* 1 if any of the person's remaining spells is a long spell
		bysort LopNr_PersonNr : egen any_LT_spells_`Y'Y`year0' = max(LT_spell`year0')
		* For every person count the number of unemployment spells in the preceding Y years
		bysort LopNr_PersonNr : egen unemplSpells`Y'Ybefore`year0'temp=count(LopNr_PersonNr)
		* Subtract 1, which is the empty observation that only includes the ID of the person
		gen unemplSpells`Y'Ybefore`year0'=unemplSpells`Y'Ybefore`year0'temp-1
		drop unemplSpells`Y'Ybefore`year0'temp
		
		* Remove the spells that were ongoing in year0-Y before moving to the 
		* next, shorter window.
		* NOTE(review): a spell that was ongoing in year0-Y AND in later years is 
		* removed here as well, so it is not counted in the shorter windows - 
		* confirm this is the intended definition.
		drop if unemplYear`year`Y''==1
	
	}
	
	keep LopNr_PersonNr unemplSpells* any_LT_spells_1Y`year0' any_LT_spells_2Y`year0' any_LT_spells_3Y`year0' any_LT_spells_4Y`year0' any_LT_spells_5Y`year0'
	pause
	* Keep one observation per person
	duplicates drop LopNr_PersonNr, force
	
	* One temporary file per year0 with one row per person
	tempfile temp`year0'
	save `temp`year0''
	
	restore
}

* Merge the information about the number of unemployment spells in the last Y years for each year
forvalues year = 1990/2017 {
	merge m:1 LopNr_PersonNr using `temp`year''
	drop if _merge==2
	drop _merge
}

keep Lop* InLnr startU trueEnd unemplSpells* any_LT*

* Reshape the data to have one row per spell and year, with the counts that refer to that year
reshape long unemplSpells5Ybefore unemplSpells4Ybefore unemplSpells3Ybefore unemplSpells2Ybefore unemplSpells1Ybefore any_LT_spells_5Y any_LT_spells_4Y any_LT_spells_3Y any_LT_spells_2Y any_LT_spells_1Y, i(LopNr_PersonNr InLnr) j(year)
* Keep only the row whose year is the year in which the spell starts, i.e. the 
* counts for the years preceding the start of the spell. This also removes the 
* ID-only rows and the spells that start outside 1990-2017.
keep if year==year(startU)
duplicates drop Lop* InLnr, force
keep LopNr* InLnr startU unempl* any_LT_spells*

tempfile tempUnempSpells
save `tempUnempSpells'


* Add the new variables to the spell-level file; spells without a match keep 
* missing values
use "${data}/${id_code}_UnemploymentSpells.dta", clear

merge m:1 LopNr* InLnr startU using `tempUnempSpells'
drop if _merge==2
drop _merge

save "${data}/${id_code}_UnemploymentSpells.dta", replace

********************************************************************************
* 2.8: Creating outcome variables 
********************************************************************************

use "${data}/${id_code}_UnemploymentSpells.dta", clear

* Cut-off date used below for spells without an observed end: such a spell 
* gets a value only if the relevant horizon lies on or before this day.
local cutoff = d(31dec2017)

* unemplXM_In: the spell lasts at least X months (months of 30 days)
forvalues m = 0/12 {

	gen unempl`m'M_In = (duration >= `m'*30) if duration != .
	label var unempl`m'M_In "Unemployed for at least `m' months"

	* Spells without an end date: they are still running after X months, as 
	* long as month X lies on or before the cut-off date
	replace unempl`m'M_In = 1 if trueEnd==. & startU!=. & (startU + `m'*30 <= `cutoff')

}

* emplAftHM_XM_In: among spells that last at least X months, the spell ends 
* within the following H months (H = 3 or 6)
forvalues m = 0/12 {

	foreach h in 3 6 {

		local horizon = `h'*30

		gen emplAft`h'M_`m'M_In = (duration < `m'*30 + `horizon') if duration != . & ///
			unempl`m'M_In == 1
		label var emplAft`h'M_`m'M_In "Cond. of min. `m' months of unempl., employed after `h' months"

		* Spells without an end date: no exit was observed within the horizon, 
		* as long as the horizon lies on or before the cut-off date
		replace emplAft`h'M_`m'M_In = 0 if trueEnd==. & startU!=. & (startU + `m'*30 + `horizon' <= `cutoff')

	}

}

* The same outcome after 1 month, for all spells (X = 0)
gen emplAft1M_0M_In = (duration < 30) if duration != . & ///
	unempl0M_In == 1
label var emplAft1M_0M_In "Cond. of min. 0 months of unempl., employed after 1 month"
replace emplAft1M_0M_In = 0 if trueEnd==. & startU!=. & (startU + 30 <= `cutoff')

* The same outcome after 12 months, for all spells (X = 0)
gen emplAft12M_0M_In = (duration < 360) if duration != . & ///
	unempl0M_In == 1
label var emplAft12M_0M_In "Cond. of min. 0 months of unempl., employed after 12 months"
replace emplAft12M_0M_In = 0 if trueEnd==. & startU!=. & (startU + 360 <= `cutoff')

compress

save "${data}/${id_code}_UnemploymentSpells.dta", replace

********************************************************************************
* 3.1: PES data - checking when data starts and ends
********************************************************************************
* Check when data starts and ends, i.e., from what timeline info about 
* unemployment spells is included in the data

use "${data}/${id_code}_UnemploymentSpells.dta", clear

* Options shared by all histograms of this section
local look frequency graphregion(color(white))
local ends_title ytitle("Number of unemployment spells ends by year")
local starts_title ytitle("Number of unemployment spells starts by year")

* ---- Spell ends -------------------------------------------------------------

* Number of spell ends by year, whole timeline
gen trueEndYear=year(trueEnd)
histogram trueEndYear, `look' `ends_title' width(1) 						///
	xlabel(1968(4)2021, angle(vertical)) xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellEnds.pdf", as(pdf) replace

* Number of spell ends by year, from 1991
histogram trueEndYear if trueEndYear>=1991 , `look' `ends_title' width(1) 	///
	xlabel(1991(1)2021, angle(vertical)) xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellEnds_from1991.pdf", as(pdf) replace

* Daily end dates at the start of the timeline
histogram trueEnd if trueEnd>d(01jan1991) & trueEnd<d(31dec1992), `look' 	///
	`ends_title' xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellEnds_zoomin1991_1992.pdf", as(pdf) replace

* Daily end dates at the end of the timeline
histogram trueEnd if trueEnd>d(01jan2016) & trueEnd<d(31dec2018), `look' 	///
	`ends_title' xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellEnds_zoominOnOrAfter2016and2018.pdf", as(pdf) replace

* ---- Spell starts -----------------------------------------------------------

* Number of spell starts by year, whole timeline
gen startUYear=year(startU)
histogram startUYear, `look' `starts_title' width(1) 						///
	xlabel(1900(4)2021, angle(vertical)) xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellStarts.pdf", as(pdf) replace

* Number of spell starts by year, from 1991
histogram startUYear if startUYear>=1991, `look' `starts_title' width(1) 	///
	xlabel(1991(1)2021, angle(vertical)) xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellStarts_from1991.pdf", as(pdf) replace

* Daily start dates at the start of the timeline
histogram startU if startU>d(01jan1991) & startU<d(31dec1992), `look' 		///
	`starts_title' xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellStarts_zoomin1991.pdf", as(pdf) replace

* Daily start dates at the end of the timeline
histogram startU if startU>d(01jan2016), `look' `starts_title' xtitle("")
graph export "$output/${id_code}_NumberUnemploymentSpellStarts_zoominOnOrAfter2016.pdf", as(pdf) replace

* ---- Share of spells ending k years after they start -------------------------

* For k = 0,...,4: share of the spells starting in a given year whose end 
* falls in the k-th calendar year after the start year, plotted by start year
forvalues k=0/4 {
	gen jobYear_plus`k'=(trueEndYear==startUYear+`k')
	* Number of such spells per start year, then divided by all spells of that year
	bys startUYear: egen share_jobYear_plus`k'=total(jobYear_plus`k')
	by startUYear: replace share_jobYear_plus`k'=share_jobYear_plus`k'/_N

	* Plot one point per start year; the full data are restored afterwards
	preserve
	duplicates drop startUYear, force

	scatter share_jobYear_plus`k' startUYear if inrange(startUYear, 1991, 2017), 	///
		graphregion(color(white)) 													///
		ytitle("Share that found a job in the year `k'" "following the start of unempl by year") ///
		xlabel(1991(1)2017, angle(vertical)) 
	graph export "$output/${id_code}_Share_found_job_year`k'.pdf", as(pdf) replace

	restore
}

